---
title: "Replication of: Distinctive Voices: Political Speech, Rhetoric and the Substantive Representation of Women in European Parliaments"
author: "Jens Wäckerle and Bruno Castanho Silva"
date: "`r format(Sys.Date(), '%B %d, %Y')`"
output: html_document
---

```{r setup, include=FALSE}
# Document-wide chunk defaults: echo the code, suppress warnings and messages.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)
```

# Data and Packages

```{r}
# Restore the saved analysis objects from the .RData file (path is relative to
# the working directory used when knitting).
# NOTE(review): the chunks below use `df.all` and `dfm.ie`, `dfm.de`, `dfm.nl`,
# `dfm.es`, `dfm.se` without creating them, so they are presumably restored
# from this file -- confirm.
load('df_analysis.RData')
```

```{r}
library(tidyverse)
library(grid)
library(gridExtra)
library(magrittr)
library(yardstick)
library(lme4)
library(texreg)
library(effects)
library(stargazer)
library(car)
library(quanteda)
```


# Manuscript

## Table 1

```{r, results = 'asis'}
# Table 1: per country and legislative term, the number of speeches (rows of
# `df.all`), the share of those speeches given by women, and the share of
# women among the distinct speakers (`name`).
tab.desc <- df.all %>%
  group_by(country_label, leg) %>%
  summarise(speeches = n(),
            sp.w = mean(female)) %>%
  ungroup()

# One row per MP *within each legislative term*. De-duplicating on `name`
# alone keeps a single row per MP across the whole data set, so an MP who
# speaks in several terms would be counted only in the first term in which
# they appear, biasing the per-term share of women.
sum.w <- df.all %>%
  distinct(country_label, leg, name, .keep_all = TRUE) %>%
  group_by(country_label, leg) %>%
  summarise(n.w = mean(female)) %>%
  ungroup()

tab.desc <- left_join(tab.desc, sum.w, by = c('country_label','leg')) %>%
  # Replace the internal term codes by the years they cover.
  mutate(leg = car::recode(leg, "'merkel_2'='2009-13';'merkel_3'='2013-17';
                   'ahern_2'='2002-07';'ahern_3_cowen'='2007-11';'kenny_1'='2011-16';
                   'balkenende_1_2'='2002-03';'balkenende_3_4'='2003-07';
                   'balkenende_5_6'='2007-10';'rutte_1_2'='2010-12';
                   'rutte_3'='2012-17';'aznar_2'='2000-04';
                   'zapatero_1'='2004-08';'zapatero_2'='2008-11';
                   'rajoy_1'='2011-15';'persson_3'='2002-06';
                   'reinfeldt_1'='2006-10';'reinfeldt_2'='2010-14';
                   'lofven_1'='2014-18'"),
         `Legislative term` = paste(country_label, leg, sep = ' ')) %>%
  select(-country_label, -leg) %>%
  mutate(sp.w = round(sp.w, 3),
         n.w = round(n.w, 3)) %>%
  rename(Speeches = speeches, `Prop. Speeches by women` = sp.w, `Prop. Women MPs` = n.w) %>%
  # Put the row label first; selecting by name does not depend on the column
  # positions produced by the steps above.
  select(`Legislative term`, Speeches, `Prop. Speeches by women`, `Prop. Women MPs`)

stargazer(tab.desc, summary = FALSE, type = 'html')

## Save to separate file
#stargazer(tab.desc, summary = F, type = 'html', out = 'tables/Table1.html')
```

## Figure 1

```{r fig.height=9}
# Dot plot of the share of speeches given by women in each policy area for one
# country, with policy areas ordered by that share.
#
# country: value of `country_label` whose speeches are kept.
# letter:  panel label pasted in front of the country name in the title,
#          e.g. 'a)'.
# Returns a ggplot object. Reads `df.all` from the calling environment.
share_plot <- function(country,letter) {
  df.all %>%
    filter(country_label == country) %>%
    # Keep a missing gender as NA so that `na.rm = TRUE` below really drops
    # those speeches; with only a `TRUE ~ 1` fallback they would be counted
    # as speeches by women.
    mutate(female = case_when(is.na(gender) ~ NA_real_,
                              gender == 'male' ~ 0,
                              TRUE ~ 1)) %>%
    group_by(policy_area_ml) %>%
    summarise(share_women = mean(female, na.rm = TRUE)) %>%
    ggplot(aes(x = reorder(policy_area_ml, share_women), y = share_women)) +
    geom_point() +
    coord_flip() +
    xlab("") +
    # Same fixed axis range in every panel; ggplot removes points outside it.
    ylim(0,0.6) +
    ylab("Share of Speeches Given by Women") +
    theme_bw() +
    ggtitle(paste(letter,country)) +
    theme(plot.title = element_text(face="bold"),
          axis.title=element_text(size=8),
          strip.text.y = element_text(angle = 0, face="bold"),
          strip.background = element_blank(),
          panel.border = element_blank(),
          axis.ticks = element_blank(),
          panel.background = element_blank(),
          panel.spacing.x = unit(0,"line"),
          legend.position = "bottom",
          legend.title = element_blank(),
          text = element_text(family = 'Gill Sans'))
}

# Figure 1: one panel per country plus a pooled panel, arranged in two columns.
p.de <- share_plot('Germany','a)')
p.ie <- share_plot('Ireland','b)')
p.es <- share_plot('Spain','c)')
p.se <- share_plot('Sweden','d)')
p.nl <- share_plot('Netherlands','e)')

# Pooled panel: the share of women's speeches is computed per country and
# policy area first and then averaged over countries, so every country has the
# same weight regardless of its number of speeches.
p.all <- df.all %>%
  # Keep a missing gender as NA so that `na.rm = TRUE` below really drops
  # those speeches; with only a `TRUE ~ 1` fallback they would be counted as
  # speeches by women.
  mutate(female = case_when(is.na(gender) ~ NA_real_,
                            gender == 'male' ~ 0,
                            TRUE ~ 1)) %>%
  group_by(country_label, policy_area_ml) %>%
  summarise(share_women = mean(female, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(policy_area_ml) %>%
  summarise(share_women = mean(share_women)) %>%
  ggplot(aes(x = reorder(policy_area_ml, share_women), y = share_women)) +
  geom_point() +
  coord_flip() +
  xlab("") +
  ylim(0,0.6) +
  ylab("Share of Speeches Given by Women") +
  theme_bw() +
  ggtitle(paste('f) All countries')) +
  theme(plot.title = element_text(face="bold"),
        axis.title=element_text(size=8),
        strip.text.y = element_text(angle = 0, face="bold"),
        strip.background = element_blank(),
        panel.border = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        panel.spacing.x = unit(0,"line"),
        legend.position = "bottom",
        legend.title = element_blank(),
        text = element_text(family = 'Gill Sans'))

grid.arrange(p.de,p.ie, p.es, p.se, p.nl, p.all, ncol = 2)

# save to separate file

#setEPS(width = 7, height = 9)
#postscript('graphs/policy_areas_women.eps')
#grid.arrange(p.de,p.ie, p.es, p.se, p.nl, p.all, ncol = 2)
#endoffile <- dev.off()
```

## Figure 2

```{r }
# Figure 2: mean `gender_diff` per policy area, country and legislative
# session, with a violin plot of the pooled distribution drawn as an inset.

# Term codes in chronological order within each country (years as in the term
# labels used for Table 1 and Figure 3). Numbering sessions with
# as.numeric(as.factor(leg)) orders the codes alphabetically, which puts
# 'rajoy_1' (2011-15) before 'zapatero_1' (2004-08) and 'lofven_1' (2014-18)
# before 'persson_3' (2002-06), so "session 1" was not the earliest term in
# Spain and Sweden.
leg.known <- c('merkel_2', 'merkel_3',
               'ahern_2', 'ahern_3_cowen', 'kenny_1',
               'balkenende_1_2', 'balkenende_3_4', 'balkenende_5_6',
               'rutte_1_2', 'rutte_3',
               'aznar_2', 'zapatero_1', 'zapatero_2', 'rajoy_1',
               'persson_3', 'reinfeldt_1', 'reinfeldt_2', 'lofven_1')
# Codes not listed above (none are expected) are numbered after the known
# ones, alphabetically, instead of being turned into NA.
leg.levels <- c(leg.known,
                sort(setdiff(unique(as.character(df.all$leg)), leg.known)))

p.policy <- df.all %>%
  group_by(country_label) %>%
  # Session number = chronological rank of the term within its country.
  mutate(session.no = dense_rank(match(as.character(leg), leg.levels))) %>%
  ungroup() %>%
  mutate(`Legislative Session` = factor(session.no)) %>%
  group_by(country_label, `Legislative Session`, policy_area_ml) %>%
  summarise(gender_diff = mean(gender_diff, na.rm = TRUE)) %>%
  ggplot(aes(y=policy_area_ml,x=gender_diff,
             shape = `Legislative Session`,
             color = `Legislative Session`))+
  geom_point(size=2)+
  geom_vline(xintercept=0)+
  theme_bw()+
  # Five values each: at most five sessions per country are distinguished.
  scale_color_manual(values = c('gray10','gray30','gray50','gray70','gray90')) +
  scale_shape_manual(values = c(15,17, 16, 17, 15)) +
  labs(x="More Salient to Men <-> More Salient to Women",y="")+
  facet_wrap(~country_label)+
  theme(strip.background = element_rect(fill="white"),
        legend.position = 'bottom',
        text = element_text(family = 'Gill Sans'))


# Pooled distribution of `gender_diff` per policy area. The constant
# `country_all` column only exists to give the inset a facet strip reading
# 'All', matching the country panels.
p.all <-
  df.all %>%
  mutate(., country_all = 'All') %>%
  ggplot(., aes(y = policy_area_ml, x = gender_diff)) + geom_violin() + facet_wrap(~country_all) + theme_bw() +
  geom_vline(xintercept = 0) +
  theme(strip.background = element_rect(fill="white"),
        legend.position = 'bottom',
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        text = element_text(family = 'Gill Sans'),
        plot.margin=grid::unit(c(1.85,1.5,0,0), "mm")) + xlab(NULL) + ylab(NULL)


# Draw the main plot, then the pooled panel inside a viewport on top of it.
# NOTE(review): the viewport position and size look hand-tuned to fill the
# empty facet slot at the default figure size -- re-check if the size changes.
vp <- viewport(width = 0.2715, height = 0.433, x = .861, y = 0.38)
print(p.policy)
print(p.all, vp = vp)

# Print in separate file
#vp <- viewport(width = 0.258, height = 0.449, x = .87, y = 0.3568)
#setEPS(width = 7, height = 6)
#postscript('graphs/descriptives_eb.eps')
#print(p.policy)
#print(p.all, vp = vp)
#endoffile <- dev.off() 
```

## Figure 3

```{r fig.height = 6}
# Figure 3: AUC of the gender classifiers per country and legislative term,
# for both models (Boost, Ridge) and both samples (Train, OOS).

# AUC of one predicted-probability column of `df.all`, by country and term.
#
# pred.col: name (string) of the column holding the predicted probabilities.
# auc.name: name given to the resulting AUC column.
# Returns a tibble with columns country_label, leg and `auc.name`.
# `event_level = 'second'` makes the second level of as.factor(female) the
# event being predicted.
auc_by_term <- function(pred.col, auc.name) {
  out <- df.all %>%
    group_by(country_label, leg) %>%
    roc_auc(., as.factor(female), all_of(pred.col), event_level = 'second') %>%
    ungroup() %>%
    select(country_label, leg, .estimate)
  names(out)[names(out) == '.estimate'] <- auc.name
  out
}

# Output column name -> prediction column. The names follow the pattern
# metric.model.sample, which pivot_longer() splits apart below.
auc.cols <- c(AUC.Boost.Train = 'pred.boost',
              AUC.Boost.OOS   = 'pred.boost.oos',
              AUC.Ridge.Train = 'pred.glmnet',
              AUC.Ridge.OOS   = 'pred.oos')

# One AUC table per prediction column, merged on country *and* term so the
# result does not rely on term codes being unique across countries.
tab4 <- Reduce(
  function(x, y) left_join(x, y, by = c('country_label', 'leg')),
  Map(auc_by_term, auc.cols, names(auc.cols))
)


# Binary predictions at a 0.5 cut-off, added to `df.all` itself. They are not
# used in this chunk. NOTE(review): missing predictions fall through to 0.
df.all <- mutate(df.all, pred.glmnet.bin = case_when(pred.glmnet >= 0.5 ~ 1,
                                                     TRUE ~ 0),
                 pred.boost.bin = case_when(pred.boost >= 0.5 ~ 1,
                                            TRUE ~ 0),
                 pred.glmnet.oos.bin = case_when(pred.oos >= 0.5 ~ 1,
                                                 TRUE ~ 0),
                 pred.boost.oos.bin = case_when(pred.boost.oos >= 0.5 ~ 1,
                                                TRUE ~ 0))



# Replace the internal term codes by the years they cover (y-axis labels).
tab4$leg <- car::recode(tab4$leg, "'merkel_2'='2009-13';'merkel_3'='2013-17';
                   'ahern_2'='2002-07';'ahern_3_cowen'='2007-11';'kenny_1'='2011-16';
                   'balkenende_1_2'='2002-03';'balkenende_3_4'='2003-07';
                   'balkenende_5_6'='2007-10';'rutte_1_2'='2010-12';
                   'rutte_3'='2012-17';'aznar_2'='2000-04';
                   'zapatero_1'='2004-08';'zapatero_2'='2008-11';
                   'rajoy_1'='2011-15';'persson_3'='2002-06';
                   'reinfeldt_1'='2006-10';'reinfeldt_2'='2010-14';
                   'lofven_1'='2014-18'")

# Long format: one row per country, term, model and sample.
tab5 <- pivot_longer(tab4, cols = AUC.Boost.Train:AUC.Ridge.OOS,
                     names_to = c('metric','model','sample'),
                     values_to = 'value',
                     names_pattern = '(.*)\\.(.*)\\.(.*)')

# Colour separates the models and shape the samples; the manual scales are
# assigned in the alphabetical order of `model_sample`.
p.pred <- tab5 %>% mutate(model_sample = paste(model, sample, sep = ' - ')) %>%
  ggplot(aes(x = value, y = leg, shape = model_sample, color = model_sample)) +
  geom_point() + facet_grid(country_label ~ ., scales = 'free', space = 'free') +
  scale_color_manual(values = c('gray70','gray70','black','black')) +
  scale_shape_manual(values = c(16,17,16,17)) +
  theme_bw() + theme(legend.position = 'bottom',
                     legend.title = element_blank(),
                     strip.text.y.right = element_text(angle = 0),
                     strip.background =element_rect(fill="white")
                     ) +
  ylab(NULL) + xlab('AUC') + xlim(c(0.5,1)) +
  theme(text=element_text(family = 'Gill Sans'))

p.pred

## Save to separate file
#setEPS(width = 5, height = 6)
#postscript('graphs/plot_accuracy.eps')
#p.pred
#endoffile <- dev.off() 
```


## Table 2

```{r cache = T}
# Model 1: linear mixed model of `pred.glmnet100` on speech-level and MP-level
# covariates plus country and party-family fixed effects, with random
# intercepts for MP (`name`), party and legislative term (`leg`); fitted by
# REML on all rows of `df.all`.
m1 <- lmer(pred.glmnet100 ~ year + speech.length10 + speech.length2 +
             pron.r + verb.r +  noun.r +  Informal.r +  Social.r + 
            Posemo.r + Negemo.r + Tentat.r +
             prop_area + 
             years_parl + minister + womens_minister + 
               prop_w + 
             country_label + family +
            (1 | name) + (1 | party) + 
             (1 | leg),
           data = df.all,
           REML = T)
# Model 2: Model 1 plus policy-area fixed effects.
m2 <- update(m1, . ~ . + policy_area_ml)

# Model 3: Model 1 plus the two salience measures, refitted only on rows with
# female == 1 and speech.length10 > 0.005.
m3 <- update(m1, . ~ . + value_Female100 + gender_diff100, data =  subset(df.all,speech.length10 > 0.005 & female == 1))
# Model 4: Model 3 plus `gender_diff_bi` and its interaction with
# `gender_diff100` (same subset, inherited from m3's call).
m4 <- update(m3, . ~ . + gender_diff100*gender_diff_bi)

# Display labels for the regression tables, passed to texreg as
# `custom.coef.map`: the list order sets the row order, and coefficients that
# are not listed (country, party family, policy area) are left out.
coef.names <- list('year' = 'Year', 
                   'speech.length10' = 'Speech length',
                   'speech.length2' = 'Speech length^2',
                   'pron.r' = 'Pronouns', 'verb.r' = 'Verbs',
                   'noun.r' = 'Nouns', 'Informal.r' = 'Informal',
                   'Social.r' = 'Social', 'Posemo.r' = 'Positive emotions',
                   'Negemo.r' = 'Negative emotions', 'Tentat.r' = 'Tentative',
                   'years_parl' = 'Years in parliament',
                   'prop_area' = 'Proportion of speeches in area',
                   'prop_w' = 'Proportion of women in legislature',
                   'minister' = 'Minister',
                   'womens_minister' = "Minister for a `female' policy area",
                 'value_Female100' = 'Absolute salience for women',
                   'gender_diff100' = 'Relative salience for women',
                     'gender_diff_bi' = 'More salient for women',
                   'gender_diff100:gender_diff_bi' = 'Relative salience * More salient for women')

```

```{r results = 'asis'}
### print model
# Table 2: the four ridge-prediction models side by side.
# `omit.coef` is a single regular expression, so the three patterns are
# combined with `|`; passed as a character vector, only its first element
# could act as the pattern. `custom.coef.map` additionally restricts the
# table to the coefficients named in `coef.names`.
# NOTE(review): `booktabs` is a LaTeX-table option and presumably has no
# effect on HTML output -- kept as in the other tables of this file.
htmlreg(l = list(m1, m2, m3, m4),
        omit.coef = 'country|policy_area_ml|family',
        custom.coef.map = coef.names,
        custom.model.names = c('Model 1','Model 2','Model 3','Model 4'),
        caption = 'Dependent variable: Predicted values of speech being given by a woman',
        booktabs = TRUE,
        single.row = TRUE)

# Save to separate file:
#wordreg(l = list(m1,m2,m3,m4),
#       omit.coef = 'country|policy_area_ml|family',
#       custom.coef.map = coef.names,
#       custom.model.names = c('Model 1','Model 2','Model 3','Model 4'),
#       title = 'Dependent variable: Predicted values of speech being given by a woman',
#       single.row = T, file = 'tables/Table2.doc')
```

## Figure 4

```{r fig.height = 9}
# Figure 4: fixed-effect estimates of Model 2 for the policy-area dummies (top
# panel) and the party-family dummies (bottom panel), each with a band of two
# standard errors on either side.

# Coefficient table of Model 2, extracted once and subset for each panel.
fixef.m2 <- as.data.frame(summary(m2)$coefficients)

# Layers shared by both panels.
fe.layers <- list(
  geom_pointrange(),
  geom_hline(yintercept = 0, linetype = 'dashed'),
  xlab(NULL),
  ylab('Estimate and 95 CI'),
  theme_minimal(),
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

## Plot policy area fixed effects:
df.plot <- fixef.m2[grepl('policy_area', row.names(fixef.m2)), ]
# Strip the variable name from the coefficient name to obtain the area label.
df.plot$policy_area <- Hmisc::capitalize(
  gsub('policy_area_ml', '', row.names(df.plot), fixed = TRUE)
)

p.areas <- ggplot(df.plot,
                  aes(x = reorder(policy_area, Estimate, min),
                      y = Estimate,
                      ymin = Estimate - 2 * `Std. Error`,
                      ymax = Estimate + 2 * `Std. Error`)) +
  fe.layers


## Plot party family fixed effects:
df.plot <- fixef.m2[grepl('family', row.names(fixef.m2)), ]
df.plot$family <- Hmisc::capitalize(
  gsub('family', '', row.names(df.plot), fixed = TRUE)
)

p.family <- ggplot(df.plot,
                   aes(x = reorder(family, Estimate, min),
                       y = Estimate,
                       ymin = Estimate - 2 * `Std. Error`,
                       ymax = Estimate + 2 * `Std. Error`)) +
  fe.layers


grid.arrange(p.areas, p.family, ncol = 1)

## Save to Figure
#setEPS(width = 7, height = 9)
#postscript('graphs/estimates_fe.eps')
#grid.arrange(p.areas,p.family, ncol=1)
#endoffile <- dev.off() 
```

## Figure 5

```{r fig.width = 5, fig.height = 5.5}
# Figure 5: predicted values from Model 4 along `gender_diff100`, using the
# `gender_diff_bi` = 0 predictions at or below zero and the
# `gender_diff_bi` = 1 predictions above zero.

# Integer grid spanning the observed range of `gender_diff100`. `na.rm = TRUE`
# keeps a missing value in the column from turning the range into NA, which
# would make seq() fail.
gd.range <- round(range(df.all$gender_diff100, na.rm = TRUE), 0)

effs <- effect(term = c("gender_diff100*gender_diff_bi"),
               xlevels = list(gender_diff100 = seq(gd.range[1], gd.range[2], 1)),
               mod = m4)

effs_data <- as.data.frame(effs)

# Keep only the combinations in which the indicator agrees with the sign of
# the salience difference.
effs_data <- effs_data %>%
  filter((gender_diff_bi == 1 & gender_diff100 > 0) |
           (gender_diff_bi == 0 & gender_diff100 <= 0))

# Solid line: fitted values; dashed lines: `lower`/`upper` bounds returned by
# effect(). All three are drawn as smooths through the grid points.
# NOTE(review): the y positions of the annotations (72.8 to 79.3) are
# hard-coded and presumably match the range of the fitted values -- confirm
# if the model or data change.
p.effs <- ggplot(effs_data,aes(x=gender_diff100,y=fit,ymin=lower,ymax=upper)) +
  geom_smooth(color="black", se=FALSE)+
  geom_smooth(aes(y=upper),lty="dashed",color="black", size = 0.5, se=FALSE)+
  geom_smooth(aes(y=lower),lty="dashed",color="black", size = 0.5, se=FALSE)+
  #  geom_errorbar()+
  theme_minimal()+
  annotate('segment',x = 0, xend = 0, y = 72.8,yend = 79, linetype = 'dotted') +
  labs(x=expression('Relative public opinion salience'),y="Predicted values of speech femininity by women MPs")+
  theme(panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        #strip.background=element_rect(fill="white"),
        strip.text = element_text( colour = "black"),
        axis.title =  element_text(color = "black"),
        axis.text = element_text(color = "black"),
        legend.background = element_rect(color = "black",
                                         fill = "white", size = 0.5, linetype = "solid")) +
  annotate('text', x = 0, y = 79.3, label = 'Issue more salient...') +
  annotate('text',x = -3, y = 79, label = '... for men') +
  annotate('text', x = 3, y = 79, label = '... for women')

  #geom_rug(data=m4@frame, mapping = aes(x = gender_diff100), col=c('black'),alpha = .005, sides='b',
  #         inherit.aes = F) + xlim(c(min(df.all$gender_diff100),max(df.all$gender_diff100)))

p.effs

# Save to other file
#setEPS(width = 5, height = 5.5)
#postscript('graphs/interaction_genderdiff.eps')
#p.effs
#endoffile <- dev.off() 
```


# Online Appendix


## Figure A.1

```{r fig.height=9}
# Dot plot of how one country's speeches are distributed over policy areas,
# with areas ordered by their share.
#
# country: value of `country_label` whose speeches are kept.
# letter:  panel label pasted in front of the country name in the title.
# Returns a ggplot object. Reads `df.all` from the calling environment.
share_area_plot <- function(country,letter) {
  # Speeches per policy area within the selected country, and each area's
  # share of that country's total.
  area.shares <- df.all %>%
    filter(country_label == country) %>%
    group_by(policy_area_ml) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    mutate(share_area = n / sum(n))

  panel.theme <- theme(plot.title = element_text(face="bold"),
                       axis.title=element_text(size=8),
                       strip.text.y = element_text(angle = 0, face="bold"),
                       strip.background = element_blank(),
                       panel.border = element_blank(),
                       axis.ticks = element_blank(),
                       panel.background = element_blank(),
                       panel.spacing.x = unit(0,"line"),
                       legend.position = "bottom",
                       legend.title = element_blank(),
                       text = element_text(family = 'Gill Sans'))

  out <- ggplot(area.shares,
                aes(x = reorder(policy_area_ml, share_area), y = share_area)) +
    geom_point() +
    coord_flip() +
    xlab("") +
    ylim(0,0.6) +
    ylab("Share of Speeches in each Area") +
    theme_bw() +
    ggtitle(paste(letter,country)) +
    panel.theme
  return(out)
}

# Figure A.1: one panel per country, arranged in two columns.
p.de <- share_area_plot(country = 'Germany', letter = 'a)')
p.ie <- share_area_plot(country = 'Ireland', letter = 'b)')
p.es <- share_area_plot(country = 'Spain', letter = 'c)')
p.se <- share_area_plot(country = 'Sweden', letter = 'd)')
p.nl <- share_area_plot(country = 'Netherlands', letter = 'e)')

grid.arrange(grobs = list(p.de, p.ie, p.es, p.se, p.nl), ncol = 2)

```

## Table B.1

```{r results = 'asis', cache = T}
# Table B.1: Models 1-4 refitted with `pred.boost100` as the dependent
# variable; right-hand sides and data are inherited from the original calls.
m1.b <- update(m1, pred.boost100 ~ .)
m2.b <- update(m2, pred.boost100 ~ .)
m3.b <- update(m3, pred.boost100 ~ .)
m4.b <- update(m4, pred.boost100 ~ .)

# `omit.coef` is a single regular expression, so the three patterns are
# combined with `|`; passed as a character vector, only its first element
# could act as the pattern.
htmlreg(l = list(m1.b, m2.b, m3.b, m4.b),
        omit.coef = 'country|policy_area_ml|family',
        custom.coef.map = coef.names,
        custom.model.names = c('Model 1','Model 2','Model 3','Model 4'),
        caption = 'Dependent variable: Predicted values of speech being given by a woman -- XGboost',
        booktabs = TRUE,
        single.row = TRUE)
```


## Table C.1 

```{r results = 'asis'}
# Table C.1: Models 1-4 refitted with `pred.glmnet.oos100` as the dependent
# variable; right-hand sides and data are inherited from the original calls.
m1.or <- update(m1, pred.glmnet.oos100 ~ .)
m2.or <- update(m2, pred.glmnet.oos100 ~ .)
m3.or <- update(m3, pred.glmnet.oos100 ~ .)
m4.or <- update(m4, pred.glmnet.oos100 ~ .)

# `omit.coef` is a single regular expression, so the three patterns are
# combined with `|`; passed as a character vector, only its first element
# could act as the pattern.
htmlreg(l = list(m1.or, m2.or, m3.or, m4.or),
        omit.coef = 'country|policy_area_ml|family',
        custom.coef.map = coef.names,
        custom.model.names = c('Model 1','Model 2','Model 3','Model 4'),
        caption = 'Dependent variable: Predicted values of speech being given by a woman -- Out-of-sample Ridge Regression',
        booktabs = TRUE,
        single.row = TRUE)
```

## Table C.2

```{r results = 'asis'}
# Table C.2: Models 1-4 refitted with `pred.boost.oos100` as the dependent
# variable; right-hand sides and data are inherited from the original calls.
m1.ob <- update(m1, pred.boost.oos100 ~ .)
m2.ob <- update(m2, pred.boost.oos100 ~ .)
m3.ob <- update(m3, pred.boost.oos100 ~ .)
m4.ob <- update(m4, pred.boost.oos100 ~ .)

# `omit.coef` is a single regular expression, so the three patterns are
# combined with `|`; passed as a character vector, only its first element
# could act as the pattern.
htmlreg(l = list(m1.ob, m2.ob, m3.ob, m4.ob),
        omit.coef = 'country|policy_area_ml|family',
        custom.coef.map = coef.names,
        custom.model.names = c('Model 1','Model 2','Model 3','Model 4'),
        caption = 'Dependent variable: Out-of-sample predicted values --  XGBoost fit for all speeches in country/legislature',
        booktabs = TRUE,
        single.row = TRUE)
```

## Tables D.1 - D.5

Please note that for Germany, the Netherlands, and Sweden, the tables presented in the Online Appendix were automatically translated into English with deepl.com from the tables produced below.

```{r results = 'asis'}
# Print a table of the eight debates with the highest mean `pred.glmnet`
# ("femininity") among one country's speeches.
#
# country: value of `country_label` whose speeches are kept.
# dfm:     object whose docvars() contain `docid` and `debate_title`; used
#          only to attach the debate title to each speech via `docid`.
# type:    output format handed to stargazer (e.g. 'html').
# Returns whatever stargazer() returns; the table itself is printed by
# stargazer. Reads `df.all` from the calling environment.
build.table <- function(country, dfm, type) {
  temp <- df.all %>%
    filter(country_label == country) %>%
    left_join(., select(docvars(dfm), docid, debate_title), by = 'docid')

  tab.temp <- temp %>%
    group_by(debate_title) %>%
    rename(Debate = debate_title) %>%
    summarise(Femininity = mean(pred.glmnet, na.rm = TRUE)) %>%
    arrange(., desc(Femininity)) %>%
    # Rank explicitly by Femininity rather than by whichever column happens
    # to be last; debates tied at the cut-off are all kept.
    top_n(8, Femininity)

  tab.temp$Femininity <- round(tab.temp$Femininity, 3)

  return(stargazer::stargazer(tab.temp, rownames = FALSE, title = paste0('Debates with Highest Average Femininity in Speeches -- ',country),
                              header = FALSE, summary = FALSE, type = type))
}

# Tables D.1 - D.5: one table per country, each built from that country's
# document-feature matrix.
build.table(country = 'Ireland', dfm = dfm.ie, type = 'html')
build.table(country = 'Germany', dfm = dfm.de, type = 'html')
build.table(country = 'Netherlands', dfm = dfm.nl, type = 'html')
build.table(country = 'Spain', dfm = dfm.es, type = 'html')
build.table(country = 'Sweden', dfm = dfm.se, type = 'html')
```